{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import json\n",
    "from tqdm import tqdm\n",
    "from time import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ! wget https://datasets.imdbws.com/name.basics.tsv.gz \n",
    "# ! wget https://datasets.imdbws.com/title.ratings.tsv.gz\n",
    "# ! wget https://datasets.imdbws.com/title.akas.tsv.gz\n",
    "# ! wget https://datasets.imdbws.com/title.basics.tsv.gz\n",
    "# ! wget https://datasets.imdbws.com/title.crew.tsv.gz\n",
    "# ! wget https://datasets.imdbws.com/title.episode.tsv.gz\n",
    "# ! wget https://datasets.imdbws.com/title.principals.tsv.gz\n",
    "# ! wget https://datasets.imdbws.com/title.episode.tsv.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Choose imdb-ids of most popular movies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "fpath = \"./title.ratings.tsv.gz\"\n",
    "\n",
    "df_ratings = pd.read_table(fpath)\n",
    "# print(df_ratings.head())\n",
    "\n",
    "df = pd.read_table(\"./title.basics.tsv.gz\", na_values={\"startYear\": [\"\\\\N\"], \"endYear\": [\"\\\\N\"], \"isAdult\": [\"\\\\N\"]})\n",
    "# print(df.head())\n",
    "\n",
    "df = df.merge(df_ratings, on=\"tconst\")\n",
    "\n",
    "#fill start year values\n",
    "df[\"startYear\"] = df[\"startYear\"].fillna(value=df[\"startYear\"])\n",
    "# print(df[\"startYear\"].isna().any())\n",
    "\n",
    "df[\"startYear\"] = df[\"startYear\"].fillna(value=0)\n",
    "# print(df[\"startYear\"].isna().any())\n",
    "\n",
    "df[\"endYear\"] = df[\"endYear\"].fillna(value=df[\"startYear\"])\n",
    "# print(df[\"endYear\"].isna().any())\n",
    "\n",
    "df[\"isAdult\"] = df[\"isAdult\"].fillna(value=0)\n",
    "# print(df[\"isAdult\"].isna().any())\n",
    "\n",
    "df = df.astype(dtype={\"startYear\": np.int32, \n",
    "                 \"endYear\": np.int32, \n",
    "                 \"isAdult\": np.int32})\n",
    "# print(df.head())\n",
    "# print(df[\"titleType\"].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(429971, 11)\n"
     ]
    }
   ],
   "source": [
    "target = [\"movie\", \"tvMovie\", \"tvSeries\", \"tvMiniSeries\"] \n",
    "ind_drop = df[~df['titleType'].isin(target)].index\n",
    "df = df.drop(ind_drop)\n",
    "print(df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "49880\n",
      "98133\n",
      "Total length of the database: 94733\n"
     ]
    }
   ],
   "source": [
    "collect_movies_based_on_rating = False\n",
    "collect_movies_based_on_numvotes = True\n",
    "\n",
    "if collect_movies_based_on_rating:\n",
    "\n",
    "    movies_ids = []\n",
    "\n",
    "    movies_ids.extend(df.loc[(df[\"startYear\"] <= 1990)  & (df[\"averageRating\"] > 8), \"tconst\"].values)\n",
    "    print(len(movies_ids))\n",
    "\n",
    "    movies_ids.extend(df.loc[(df[\"startYear\"] > 1990) & (df[\"startYear\"] <= 2005) & (df[\"averageRating\"] > 7), \"tconst\"].values)\n",
    "    print(len(movies_ids))\n",
    "\n",
    "    movies_ids.extend(df.loc[(df[\"startYear\"] > 2005) & (df[\"startYear\"] <= 2015) & (df[\"averageRating\"] > 6), \"tconst\"].values)\n",
    "    print(len(movies_ids))\n",
    "\n",
    "    movies_ids.extend(df.loc[(df[\"startYear\"] > 2015) & (df[\"startYear\"] <= 2022) & (df[\"averageRating\"] > 5), \"tconst\"].values)\n",
    "    print(len(movies_ids))\n",
    "\n",
    "    with open(\"all_imdb_ids.txt\", \"w\") as f:\n",
    "        for movie_id in movies_ids:\n",
    "            f.write(str(movie_id) + \"\\n\")\n",
    "\n",
    "if collect_movies_based_on_numvotes:\n",
    "\n",
    "    movies_ids = []\n",
    "\n",
    "    movies_ids.extend(df.loc[df.loc[:, \"numVotes\"] > 1000, \"tconst\"].values)\n",
    "    print(len(movies_ids))\n",
    "\n",
    "    movies_ids.extend(df.loc[df.loc[:, \"averageRating\"] > 8., \"tconst\"].values)\n",
    "    print(len(movies_ids))\n",
    "    \n",
    "    movies_ids = list(set(movies_ids))\n",
    "    print(f\"Total length of the database: {len(movies_ids)}\")\n",
    "\n",
    "    with open(\"most_popular_imdb_ids.txt\", \"w\") as f:\n",
    "        for movie_id in movies_ids:\n",
    "            f.write(str(movie_id) + \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of considered movies: 94733\n"
     ]
    }
   ],
   "source": [
    "with open(\"most_popular_imdb_ids.txt\", \"r\") as f:\n",
    "    all_movies_ids = f.read().splitlines()\n",
    "    \n",
    "all_movies_ids = list(set(all_movies_ids))\n",
    "print(f\"Total number of considered movies: {len(all_movies_ids)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Collect titles and ratings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total time: 26.564144134521484\n"
     ]
    }
   ],
   "source": [
    "t0 = time()\n",
    "fpath = \"./title.ratings.tsv.gz\"\n",
    "df_ratings = pd.read_table(fpath)\n",
    "\n",
    "ind_drop = df_ratings[~df_ratings['tconst'].isin(all_movies_ids)].index\n",
    "df_ratings = df_ratings.drop(ind_drop)\n",
    "assert df_ratings.shape[0] == len(all_movies_ids), print(\"Number of samples less than number of movies\")\n",
    "# print(df_ratings.head())\n",
    "\n",
    "\n",
    "fpath = \"./title.basics.tsv.gz\"\n",
    "\n",
    "df = pd.read_table(fpath, na_values={\"startYear\": [\"\\\\N\"], \"endYear\": [\"\\\\N\"], \"isAdult\": [\"\\\\N\"]})\n",
    "# print(df.head())\n",
    "\n",
    "ind_drop = df[~df['tconst'].isin(all_movies_ids)].index\n",
    "df = df.drop(ind_drop)\n",
    "\n",
    "df = df.merge(df_ratings, on=\"tconst\")\n",
    "\n",
    "df.rename(columns={\"originalTitle\": \"original title\",\n",
    "                   \"primaryTitle\": \"title\",\n",
    "                   \"genres\": \"genre\",\n",
    "                   \"averageRating\": \"imdb_rating\",\n",
    "                   \"tconst\": \"imdb_id\"\n",
    "                  }, inplace=True)\n",
    "\n",
    "# print(df.head())\n",
    "df.drop_duplicates(inplace=True)\n",
    "# print(df.shape)\n",
    "\n",
    "df[\"titleType\"] = df[\"titleType\"].apply(lambda x: \"Series\" if \"Series\" in x else \"\")\n",
    "df[\"genre\"] = [\",\".join([x,y]) if y != \"\" else x for x,y in zip(df[\"genre\"], df[\"titleType\"])]\n",
    "df[\"genre\"] = df[\"genre\"].apply(lambda x: x if x != \"\\\\N\" else \"\")\n",
    "df[\"genre\"] = df[\"genre\"].apply(lambda x: x.split(\",\"))\n",
    "\n",
    "df.fillna({\"startYear\": 0, \"endYear\": 0}, inplace=True)\n",
    "df[\"startYear\"] = df[\"startYear\"].astype(\"int\")\n",
    "df[\"endYear\"] = df[\"endYear\"].astype(\"int\")\n",
    "df.drop([\"titleType\", \"isAdult\", \"runtimeMinutes\"], axis=1, inplace=True)\n",
    "assert df.shape[0] == len(all_movies_ids), print(\"Number of samples less than number of movies\")\n",
    "\n",
    "print(f\"Total time: {time() - t0}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Collect names of actors etc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     imdb_id     nconst  ordering         category characters\n",
      "0  tt0000001  nm1588970         1             self   [\"Self\"]\n",
      "1  tt0000001  nm0005690         2         director         \\N\n",
      "2  tt0000001  nm0374658         3  cinematographer         \\N\n",
      "3  tt0000002  nm0721526         1         director         \\N\n",
      "4  tt0000002  nm1335271         2         composer         \\N\n"
     ]
    }
   ],
   "source": [
    "t0 = time()\n",
    "fpath = \"./title.principals.tsv.gz\"\n",
    "\n",
    "df_principals = pd.read_table(fpath)\n",
    "df_principals = df_principals.loc[:, [\"tconst\", \"nconst\", \"ordering\", \"category\", \"characters\"]]\n",
    "df_principals.rename(columns={\"tconst\": \"imdb_id\"}, inplace=True)\n",
    "print(df_principals.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        imdb_id     nconst  ordering             category  \\\n",
      "9953  tt0002130  nm1376180        10  production_designer   \n",
      "9954  tt0002130  nm0660139         1                actor   \n",
      "9955  tt0002130  nm0685283         2                actor   \n",
      "9956  tt0002130  nm0209738         3                actor   \n",
      "9957  tt0002130  nm3942815         4                actor   \n",
      "\n",
      "                                             characters  \n",
      "9953                                                 \\N  \n",
      "9954                                [\"Dante Alighieri\"]  \n",
      "9955                                       [\"Virgilio\"]  \n",
      "9956  [\"Farinata degli Uberti\",\"Pier delle Vigne\",\"I...  \n",
      "9957                               [\"Il conte Ugolino\"]  \n"
     ]
    }
   ],
   "source": [
    "ind_drop = df_principals[~df_principals['imdb_id'].isin(all_movies_ids)].index\n",
    "df_principals = df_principals.drop(ind_drop)\n",
    "print(df_principals.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        imdb_id     nconst  ordering  category  \\\n",
      "9954  tt0002130  nm0660139         1     actor   \n",
      "9955  tt0002130  nm0685283         2     actor   \n",
      "9956  tt0002130  nm0209738         3     actor   \n",
      "9957  tt0002130  nm3942815         4     actor   \n",
      "9958  tt0002130  nm0078205         5  director   \n",
      "\n",
      "                                             characters  \n",
      "9954                                [\"Dante Alighieri\"]  \n",
      "9955                                       [\"Virgilio\"]  \n",
      "9956  [\"Farinata degli Uberti\",\"Pier delle Vigne\",\"I...  \n",
      "9957                               [\"Il conte Ugolino\"]  \n",
      "9958                                                 \\N  \n"
     ]
    }
   ],
   "source": [
    "\n",
    "ind_drop = df_principals[~df_principals['ordering'].isin([1, 2, 3, 4, 5, 6])].index\n",
    "df_principals = df_principals.drop(ind_drop)\n",
    "print(df_principals.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        imdb_id     nconst  ordering  category  \\\n",
      "9954  tt0002130  nm0660139         1     actor   \n",
      "9955  tt0002130  nm0685283         2     actor   \n",
      "9956  tt0002130  nm0209738         3     actor   \n",
      "9957  tt0002130  nm3942815         4     actor   \n",
      "9958  tt0002130  nm0078205         5  director   \n",
      "\n",
      "                                             characters  \n",
      "9954                                [\"Dante Alighieri\"]  \n",
      "9955                                       [\"Virgilio\"]  \n",
      "9956  [\"Farinata degli Uberti\",\"Pier delle Vigne\",\"I...  \n",
      "9957                               [\"Il conte Ugolino\"]  \n",
      "9958                                                 \\N  \n"
     ]
    }
   ],
   "source": [
    "\n",
    "df_principals[\"category\"] = df_principals[\"category\"].apply(lambda x: x if x != \"actress\" else \"actor\")\n",
    "target_profs = [\"director\", \"producer\", \"actor\", \"writer\"] \n",
    "ind_drop = df_principals[~df_principals['category'].isin(target_profs)].index\n",
    "df_principals = df_principals.drop(ind_drop)\n",
    "print(df_principals.head())\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "       primaryName     nconst\n",
      "0     Fred Astaire  nm0000001\n",
      "1    Lauren Bacall  nm0000002\n",
      "2  Brigitte Bardot  nm0000003\n",
      "3     John Belushi  nm0000004\n",
      "4   Ingmar Bergman  nm0000005\n"
     ]
    }
   ],
   "source": [
    "\n",
    "fpath = \"./name.basics.tsv.gz\"\n",
    "\n",
    "df_names = pd.read_table(fpath)\n",
    "df_names = df_names.loc[:, [\"primaryName\", \"nconst\"]]\n",
    "print(df_names.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0                                       [\"Dante Alighieri\"]\n",
      "1                                              [\"Virgilio\"]\n",
      "2         [\"Farinata degli Uberti\",\"Pier delle Vigne\",\"I...\n",
      "3                                      [\"Il conte Ugolino\"]\n",
      "4                                                        \\N\n",
      "                                ...                        \n",
      "451193                                                   \\N\n",
      "451194                                            [\"Sinta\"]\n",
      "451195                                           [\"Vikash\"]\n",
      "451196                                             [\"Dewi\"]\n",
      "451197                                                   \\N\n",
      "Name: characters, Length: 451198, dtype: object\n"
     ]
    }
   ],
   "source": [
    "\n",
    "df_principals = df_principals.merge(df_names, on=\"nconst\")\n",
    "print(df_principals[\"characters\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     imdb_id     nconst  ordering  category  \\\n",
      "0  tt0002130  nm0660139         1     actor   \n",
      "1  tt0002130  nm0685283         2     actor   \n",
      "2  tt0002130  nm0209738         3     actor   \n",
      "3  tt0002130  nm3942815         4     actor   \n",
      "4  tt0002130  nm0078205         5  director   \n",
      "\n",
      "                                          characters          primaryName  \n",
      "0                                  [Dante Alighieri]       Salvatore Papa  \n",
      "1                                         [Virgilio]      Arturo Pirovano  \n",
      "2  [Farinata degli Uberti, Pier delle Vigne, Il c...  Giuseppe de Liguoro  \n",
      "3                                 [Il conte Ugolino]     Pier Delle Vigne  \n",
      "4                                                 []  Francesco Bertolini  \n",
      "Total time: 111.03914666175842\n"
     ]
    }
   ],
   "source": [
    "special_char = df_principals.loc[4, \"characters\"]\n",
    "df_principals[\"characters\"] = df_principals[\"characters\"].apply(\n",
    "    lambda x: [] if x == special_char or len(x) == 0 else json.loads(x))\n",
    "print(df_principals.head())\n",
    "\n",
    "print(f\"Total time: {time() - t0}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Collect persons"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                       directors producers  \\\n",
      "imdb_id                                                      \n",
      "tt0002130  [Francesco Bertolini, Adolfo Padovan]        []   \n",
      "tt0002844                      [Louis Feuillade]        []   \n",
      "tt0003014                      [Victor Sjöström]        []   \n",
      "tt0003037                      [Louis Feuillade]        []   \n",
      "tt0003165                      [Louis Feuillade]        []   \n",
      "\n",
      "                                                      actors          writers  \\\n",
      "imdb_id                                                                         \n",
      "tt0002130  [Salvatore Papa, Arturo Pirovano, Giuseppe de ...               []   \n",
      "tt0002844  [René Navarre, Edmund Breon, Georges Melchior,...  [Marcel Allain]   \n",
      "tt0003014  [Hilda Borgström, Georg Grönroos, Aron Lindgre...      [Nils Krok]   \n",
      "tt0003037  [René Navarre, Edmund Breon, Georges Melchior,...  [Marcel Allain]   \n",
      "tt0003165  [René Navarre, Edmund Breon, Georges Melchior,...  [Marcel Allain]   \n",
      "\n",
      "                                                  characters  \n",
      "imdb_id                                                       \n",
      "tt0002130  [[Dante Alighieri], [Virgilio], [Farinata degl...  \n",
      "tt0002844  [[Fantômas, Gurn], [Inspector Juve], [Jérôme F...  \n",
      "tt0003014  [[Ingeborg Holm], [Poorhouse Superintendant], ...  \n",
      "tt0003037  [[Fantômas, Dr Chaleck, Le Loupart], [Inspecto...  \n",
      "tt0003165  [[Fantômas, le banquier Nanteuil], [Inspector ...  \n",
      "                                       directors producers  \\\n",
      "imdb_id                                                      \n",
      "tt0002130  [Francesco Bertolini, Adolfo Padovan]        []   \n",
      "tt0002844                      [Louis Feuillade]        []   \n",
      "tt0003014                      [Victor Sjöström]        []   \n",
      "tt0003037                      [Louis Feuillade]        []   \n",
      "tt0003165                      [Louis Feuillade]        []   \n",
      "\n",
      "                                                      actors          writers  \\\n",
      "imdb_id                                                                         \n",
      "tt0002130  [Salvatore Papa, Arturo Pirovano, Giuseppe de ...               []   \n",
      "tt0002844  [René Navarre, Edmund Breon, Georges Melchior,...  [Marcel Allain]   \n",
      "tt0003014  [Hilda Borgström, Georg Grönroos, Aron Lindgre...      [Nils Krok]   \n",
      "tt0003037  [René Navarre, Edmund Breon, Georges Melchior,...  [Marcel Allain]   \n",
      "tt0003165  [René Navarre, Edmund Breon, Georges Melchior,...  [Marcel Allain]   \n",
      "\n",
      "                                                  characters  \n",
      "imdb_id                                                       \n",
      "tt0002130  [Dante Alighieri, Virgilio, Farinata degli Ube...  \n",
      "tt0002844  [Fantômas, Gurn, Inspector Juve, Jérôme Fandor...  \n",
      "tt0003014  [Ingeborg Holm, Poorhouse Superintendant, Sven...  \n",
      "tt0003037  [Fantômas, Dr Chaleck, Le Loupart, Inspector J...  \n",
      "tt0003165  [Fantômas, le banquier Nanteuil, Inspector Juv...  \n"
     ]
    }
   ],
   "source": [
    "t0 = time()\n",
    "\n",
    "def collect_movie_persons(x):\n",
    "     return pd.Series({f\"{role}s\": x.loc[x.sort_values(by=[\"ordering\"])[\"category\"] == prof, name].values.tolist()\n",
    "                       for prof, role, name in zip(\n",
    "                           [\"director\", \"producer\", \"actor\", \"writer\", \"actor\"], \n",
    "                           [\"director\", \"producer\", \"actor\", \"writer\", \"character\"], \n",
    "                           [\"primaryName\", \"primaryName\", \"primaryName\", \"primaryName\", \"characters\"]\n",
    "                       )\n",
    "                      })\n",
    "    \n",
    "df_principals = pd.DataFrame(df_principals.groupby('imdb_id').apply(collect_movie_persons))\n",
    "print(df_principals.head())\n",
    "df_principals[\"characters\"] = df_principals[\"characters\"].apply(lambda x: sum(x, []) if isinstance(x, list) else [])\n",
    "print(df_principals.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total time: 470.4389228820801\n"
     ]
    }
   ],
   "source": [
    "\n",
    "df.set_index(\"imdb_id\", inplace=True)\n",
    "df = df.join(df_principals, on=\"imdb_id\")\n",
    "df.reset_index(inplace=True)\n",
    "\n",
    "df.fillna(value={f\"{prof}s\": \"\" for prof in target_profs}, inplace=True)\n",
    "\n",
    "assert df.shape[0] == len(all_movies_ids), print(\"Number of samples less than number of movies\")\n",
    "\n",
    "print(f\"Total time: {time() - t0}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Collect alternative titles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/dilyara.baymurzina/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2802: DtypeWarning: Columns (7) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  if self.run_code(code, result):\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(81178, 8)\n",
      "        imdb_id  ordering                                      title region  \\\n",
      "6436  tt0002130        15                            Dante's Inferno     US   \n",
      "8439  tt0002844        16              Fantomas: The Beltham Mystery     US   \n",
      "8454  tt0002844         5                          The Phantom Crook     US   \n",
      "8456  tt0002844         7  Fantômas: In the Shadow of the Guillotine     US   \n",
      "8961  tt0003037        12                 Fantomas: The Man in Black     US   \n",
      "\n",
      "     language        types     attributes isOriginalTitle  \n",
      "6436       \\N           \\N             \\N               0  \n",
      "8439       \\N           \\N   review title               0  \n",
      "8454       \\N           \\N  reissue title               0  \n",
      "8456       \\N  imdbDisplay             \\N               0  \n",
      "8961       \\N  imdbDisplay             \\N               0  \n",
      "Total time: 43.85930609703064\n"
     ]
    }
   ],
   "source": [
    "t0 = time()\n",
    "\n",
    "fpath = \"./title.akas.tsv.gz\"\n",
    "\n",
    "df_akas = pd.read_table(fpath)\n",
    "# df_akas = df_akas.loc[:, [\"tconst\", \"nconst\", \"ordering\", \"category\"]]\n",
    "df_akas = df_akas.loc[df_akas[\"region\"] == \"US\", :]\n",
    "df_akas.rename(columns={\"titleId\": \"imdb_id\"}, inplace=True)\n",
    "\n",
    "ind_drop = df_akas[~df_akas['imdb_id'].isin(all_movies_ids)].index\n",
    "df_akas = df_akas.drop(ind_drop)\n",
    "print(df_akas.shape)\n",
    "print(df_akas.head())\n",
    "\n",
    "grouped_data = df_akas.groupby('imdb_id')['title'].apply(lambda x: '::'.join(x))\n",
    "df_titles = pd.DataFrame(grouped_data)\n",
    "df_titles.rename(columns={\"title\": \"all_titles\"}, inplace=True)\n",
    "# df_titles.set_index(\"imdb_id\", inplace=True)\n",
    "\n",
    "df.set_index(\"imdb_id\", inplace=True)\n",
    "df = df.join(df_titles, on=\"imdb_id\")\n",
    "df.reset_index(inplace=True)\n",
    "\n",
    "df.fillna(value={\"all_titles\": \"\"}, inplace=True)\n",
    "\n",
    "assert df.shape[0] == len(all_movies_ids), print(\"Number of samples less than number of movies\")\n",
    "\n",
    "print(f\"Total time: {time() - t0}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>imdb_id</th>\n",
       "      <th>title</th>\n",
       "      <th>original title</th>\n",
       "      <th>startYear</th>\n",
       "      <th>endYear</th>\n",
       "      <th>genre</th>\n",
       "      <th>imdb_rating</th>\n",
       "      <th>numVotes</th>\n",
       "      <th>directors</th>\n",
       "      <th>producers</th>\n",
       "      <th>actors</th>\n",
       "      <th>writers</th>\n",
       "      <th>characters</th>\n",
       "      <th>all_titles</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>tt0002130</td>\n",
       "      <td>Dante's Inferno</td>\n",
       "      <td>L'Inferno</td>\n",
       "      <td>1911</td>\n",
       "      <td>0</td>\n",
       "      <td>[Adventure, Drama, Fantasy]</td>\n",
       "      <td>7.0</td>\n",
       "      <td>2987</td>\n",
       "      <td>[Francesco Bertolini, Adolfo Padovan]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[Salvatore Papa, Arturo Pirovano, Giuseppe de ...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[Dante Alighieri, Virgilio, Farinata degli Ube...</td>\n",
       "      <td>Dante's Inferno</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>tt0002844</td>\n",
       "      <td>Fantômas: In the Shadow of the Guillotine</td>\n",
       "      <td>Fantômas - À l'ombre de la guillotine</td>\n",
       "      <td>1913</td>\n",
       "      <td>0</td>\n",
       "      <td>[Crime, Drama]</td>\n",
       "      <td>6.9</td>\n",
       "      <td>2328</td>\n",
       "      <td>[Louis Feuillade]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[René Navarre, Edmund Breon, Georges Melchior,...</td>\n",
       "      <td>[Marcel Allain]</td>\n",
       "      <td>[Fantômas, Gurn, Inspector Juve, Jérôme Fandor...</td>\n",
       "      <td>Fantomas: The Beltham Mystery::The Phantom Cro...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>tt0003014</td>\n",
       "      <td>Ingeborg Holm</td>\n",
       "      <td>Ingeborg Holm</td>\n",
       "      <td>1913</td>\n",
       "      <td>0</td>\n",
       "      <td>[Drama]</td>\n",
       "      <td>7.0</td>\n",
       "      <td>1249</td>\n",
       "      <td>[Victor Sjöström]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[Hilda Borgström, Georg Grönroos, Aron Lindgre...</td>\n",
       "      <td>[Nils Krok]</td>\n",
       "      <td>[Ingeborg Holm, Poorhouse Superintendant, Sven...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>tt0003037</td>\n",
       "      <td>Fantomas: The Man in Black</td>\n",
       "      <td>Juve contre Fantômas</td>\n",
       "      <td>1913</td>\n",
       "      <td>0</td>\n",
       "      <td>[Crime, Drama]</td>\n",
       "      <td>6.9</td>\n",
       "      <td>1584</td>\n",
       "      <td>[Louis Feuillade]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[René Navarre, Edmund Breon, Georges Melchior,...</td>\n",
       "      <td>[Marcel Allain]</td>\n",
       "      <td>[Fantômas, Dr Chaleck, Le Loupart, Inspector J...</td>\n",
       "      <td>Fantomas: The Man in Black::Fantômas: Juve ver...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>tt0003165</td>\n",
       "      <td>Fantômas: The Dead Man Who Killed</td>\n",
       "      <td>Le mort qui tue</td>\n",
       "      <td>1913</td>\n",
       "      <td>0</td>\n",
       "      <td>[Crime, Drama, Mystery]</td>\n",
       "      <td>6.9</td>\n",
       "      <td>1251</td>\n",
       "      <td>[Louis Feuillade]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[René Navarre, Edmund Breon, Georges Melchior,...</td>\n",
       "      <td>[Marcel Allain]</td>\n",
       "      <td>[Fantômas, le banquier Nanteuil, Inspector Juv...</td>\n",
       "      <td>Fantômas: The Dead Man Who Killed</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     imdb_id                                      title  \\\n",
       "0  tt0002130                            Dante's Inferno   \n",
       "1  tt0002844  Fantômas: In the Shadow of the Guillotine   \n",
       "2  tt0003014                              Ingeborg Holm   \n",
       "3  tt0003037                 Fantomas: The Man in Black   \n",
       "4  tt0003165          Fantômas: The Dead Man Who Killed   \n",
       "\n",
       "                          original title  startYear  endYear  \\\n",
       "0                              L'Inferno       1911        0   \n",
       "1  Fantômas - À l'ombre de la guillotine       1913        0   \n",
       "2                          Ingeborg Holm       1913        0   \n",
       "3                   Juve contre Fantômas       1913        0   \n",
       "4                        Le mort qui tue       1913        0   \n",
       "\n",
       "                         genre  imdb_rating  numVotes  \\\n",
       "0  [Adventure, Drama, Fantasy]          7.0      2987   \n",
       "1               [Crime, Drama]          6.9      2328   \n",
       "2                      [Drama]          7.0      1249   \n",
       "3               [Crime, Drama]          6.9      1584   \n",
       "4      [Crime, Drama, Mystery]          6.9      1251   \n",
       "\n",
       "                               directors producers  \\\n",
       "0  [Francesco Bertolini, Adolfo Padovan]        []   \n",
       "1                      [Louis Feuillade]        []   \n",
       "2                      [Victor Sjöström]        []   \n",
       "3                      [Louis Feuillade]        []   \n",
       "4                      [Louis Feuillade]        []   \n",
       "\n",
       "                                              actors          writers  \\\n",
       "0  [Salvatore Papa, Arturo Pirovano, Giuseppe de ...               []   \n",
       "1  [René Navarre, Edmund Breon, Georges Melchior,...  [Marcel Allain]   \n",
       "2  [Hilda Borgström, Georg Grönroos, Aron Lindgre...      [Nils Krok]   \n",
       "3  [René Navarre, Edmund Breon, Georges Melchior,...  [Marcel Allain]   \n",
       "4  [René Navarre, Edmund Breon, Georges Melchior,...  [Marcel Allain]   \n",
       "\n",
       "                                          characters  \\\n",
       "0  [Dante Alighieri, Virgilio, Farinata degli Ube...   \n",
       "1  [Fantômas, Gurn, Inspector Juve, Jérôme Fandor...   \n",
       "2  [Ingeborg Holm, Poorhouse Superintendant, Sven...   \n",
       "3  [Fantômas, Dr Chaleck, Le Loupart, Inspector J...   \n",
       "4  [Fantômas, le banquier Nanteuil, Inspector Juv...   \n",
       "\n",
       "                                          all_titles  \n",
       "0                                    Dante's Inferno  \n",
       "1  Fantomas: The Beltham Mystery::The Phantom Cro...  \n",
       "2                                                     \n",
       "3  Fantomas: The Man in Black::Fantômas: Juve ver...  \n",
       "4                  Fantômas: The Dead Man Who Killed  "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "94733\n"
     ]
    }
   ],
   "source": [
    "database = df.to_dict(\"records\")\n",
    "for el in database:\n",
    "    el[\"genre\"] = el[\"genre\"] if el[\"genre\"] != \"\" else None\n",
    "    el[\"startYear\"] = el[\"startYear\"] if el[\"startYear\"] != 0 else None\n",
    "    el[\"endYear\"] = el[\"endYear\"] if el[\"endYear\"] != 0 else None\n",
    "    el[\"all_titles\"] = el[\"all_titles\"].split(\"::\") if el[\"all_titles\"] != \"\" else []\n",
    "    for prof in [\"director\", \"producer\", \"actor\", \"writer\"]:\n",
    "        el[f\"{prof}s\"] = list(el[f\"{prof}s\"]) if list(el[f\"{prof}s\"]) != \"\" else []\n",
    "\n",
    "print(len(database))\n",
    "with open(\"database_most_popular_main_info.json\", \"w\") as f:\n",
    "    json.dump(database, f, indent=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'actors': ['Rudolph Valentino', 'Lila Lee', 'Nita Naldi', 'George Field'],\n",
       " 'all_titles': ['Blood and Sand'],\n",
       " 'characters': ['Juan Gallardo', 'Carmen', 'Doña Sol', 'El Nacional'],\n",
       " 'directors': ['Fred Niblo', 'Dorothy Arzner'],\n",
       " 'endYear': None,\n",
       " 'genre': ['Drama', 'Romance', 'Sport'],\n",
       " 'imdb_id': 'tt0012952',\n",
       " 'imdb_rating': 6.4,\n",
       " 'numVotes': 1434,\n",
       " 'original title': 'Blood and Sand',\n",
       " 'producers': [],\n",
       " 'startYear': 1922,\n",
       " 'title': 'Blood and Sand',\n",
       " 'writers': []}"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "database[100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
